summary of plan

we investigate the effect of diet composition on different measures of obesity.

Obesity can be measured in multiple ways

But each of these may categorise obesity differently.

In terms of diet composition, we look at

  • daily servings of meat
  • daily servings of veg
  • daily servings of dairy
  • daily servings of fruit
  • daily servings of grains

we then want to see what is the effect of each food type on a measurement for obesity (whether it is significant or not), and compare how this varies across each obesity measurement.

e.g. Maybe meat servings is a significant contributor to BMI but not to waist circumference.

Then, we can compare to see the effect of each food type across different obesity measures. Examples

e.g. Does fruit servings have a bigger effect on BMI, waist to height ratio, or waist circumference which of the food types has the biggest contribution to BMI reduction/gain.

We can do this by calculating and comparing the standardized betas.

(we can adjust for obvious contributors to each measurement such as age and sex).

exec summary of findings

  • dairy is insignificant conributor to all measures of obesity
# selecting macros
foods = tech_nutr %>% dplyr::select(ABSPID,
                             VEGLEG1N, 
                             VEGLEG2N,
                              FRUIT1N,
                              FRUIT2N,
                              DAIRY1N,
                              DAIRY2N,
                              MEAT1N,
                              MEAT2N,
                              GRAINS1N,
                              GRAINS2N)

# gettin average of macros
avg_veges <- rowMeans(foods[ , c(2,3)], na.rm=TRUE)
avg_fruit <- rowMeans(foods[ , c(4,5)], na.rm=TRUE)
avg_dairy <- rowMeans(foods[ , c(6,7)], na.rm=TRUE)
avg_meat <- rowMeans(foods[ , c(8,9)], na.rm=TRUE)
avg_grains <- rowMeans(foods[ , c(10,11)], na.rm=TRUE)


dat <- cbind(tech_nutr$ABSPID, 
             avg_veges,
             avg_fruit,
             avg_dairy,
             avg_meat,
             avg_grains)

dat <- as_tibble(dat)
colnames(dat)[1] <- "ABSPID"

dat$avg_veges <- as.numeric(dat$avg_veges)
dat$avg_fruit <- as.numeric(dat$avg_fruit)
dat$avg_dairy <- as.numeric(dat$avg_dairy)
dat$avg_meat <- as.numeric(dat$avg_meat)
dat$avg_grains <- as.numeric(dat$avg_grains)



tech_biom1 = tech_biom %>% dplyr::select(c(1:53))

final = merge(dat, tech_biom1, by = "ABSPID")
final = final %>% dplyr::select(avg_veges,
                         avg_fruit,
                         avg_dairy,
                         avg_meat,
                         avg_grains,
                         BMISC,
                         AGEC,
                         SEX,
                         PHDKGWBC,
                         PHDCMHBC,
                         PHDCMWBC,
                         SF2SA1QN)


final$w2hratio = (final$PHDCMWBC)/(final$PHDCMHBC)

# to make results more robust, focus on adults only

final <- final %>% filter(AGEC > 17)

final = final %>% na.omit()

IDA on varibales in df

numeric_hist <- function(data, x) {
  ggplot(data, aes_string(x = `x`)) +
  geom_histogram()
}


numeric_hist(final, x = "BMISC")

numeric_hist(final, x = "avg_dairy")

numeric_hist(final, x = "avg_fruit")

numeric_hist(final, x = "avg_meat")

numeric_hist(final, x = "avg_grains")

numeric_hist(final, x = "avg_veges")

numeric_hist(final, x = "AGEC")

short analysis

linear regression with BMI as outcome

bmi_full <- lm(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = final)
bmi_null <- lm(BMISC ~ 1, dat = final)

linear regression with waist to heigh ratio as outcome

w2hratio_full <- lm(w2hratio ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = final)
w2hratio_null <- lm(w2hratio ~ 1, dat = final)

linear regression with waist circumference as outcome

waist_full <- lm(PHDCMWBC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = final) 
waist_null <- lm(PHDCMWBC ~ 1, dat = final) 

NOTE: robust regression was attempted and gave similar results to OLS, so just using OLS instead.

table of all models

tab_model(bmi_full, waist_full, w2hratio_full, show.ci = F, show.se = T,
            dv.labels = c("BMI", "Waist circumf.", "waist to height ratio"))
  BMI Waist circumf. waist to height ratio
Predictors Estimates std. Error p Estimates std. Error p Estimates std. Error p
(Intercept) 25.98 0.24 <0.001 89.07 0.59 <0.001 0.49 0.00 <0.001
avg veges -0.12 0.03 <0.001 -0.29 0.08 <0.001 -0.00 0.00 <0.001
avg fruit -0.20 0.05 <0.001 -0.57 0.11 <0.001 -0.00 0.00 <0.001
avg dairy 0.10 0.07 0.141 0.18 0.16 0.255 -0.00 0.00 0.275
avg meat 0.22 0.05 <0.001 0.37 0.12 0.002 0.00 0.00 0.007
avg grains -0.17 0.03 <0.001 -0.38 0.07 <0.001 -0.00 0.00 <0.001
AGEC 0.05 0.00 <0.001 0.23 0.01 <0.001 0.00 0.00 <0.001
SEX [2] -0.66 0.13 <0.001 -10.08 0.31 <0.001 -0.02 0.00 <0.001
Observations 7811 7811 7811
R2 / R2 adjusted 0.041 / 0.040 0.190 / 0.190 0.155 / 0.154

dairy is insignificant for all

Model selection and performance (10 fold CV)

Model selection using backward and forward selection

BMI

n = 9979

# bmi_AIC is just full model
# bmi_AIC <- step(bmi_full, direction = "backward", trace = F)

bmi_BIC <- step(bmi_full, direction = "backward", trace = F, k = log(n))

# fwd AIC model same as back
# step(bmi_null, scope = list(lower = bmi_null, upper = bmi_full), direction = "forward", trace = F)

waist circumeference

waist_AIC <- step(waist_full, direction = "backward", trace = F)

# waist_BIC same as AIC
# waist_BIC <- step(waist_full, direction = "backward", trace = F, k = log(n))


# backward selection same as fwd
# step(waist_null, scope = list(lower = waist_null, upper = waist_full), direction = "forward", trace = F)

w2h ratio

w2hratio_AIC <- step(w2hratio_full, direction = "backward", trace = F)
w2hratio_BIC <- step(w2hratio_full, direction = "backward", trace = F, k = log(n))

# backward selection same as fwd
# step(w2hratio_null, scope = list(lower = w2hratio_null, upper = waist_full), direction = "forward", trace = F)
params = trainControl(method = "cv", number = 10, verboseIter = FALSE)

set.seed(2021)

cv_objects = list(
  bmi_full = train(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat +
    avg_grains + AGEC + SEX, 
    method = "lm", 
    data = final,
    trControl = params),
  bmi_BIC = train(BMISC ~ avg_veges + avg_fruit + avg_meat + avg_grains + 
    AGEC + SEX,
    method = "lm", 
    data = final,
    trControl = params),
  waist_full = train(PHDCMWBC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX,
    method = "lm", 
    data = final,
    trControl = params),  
  waist_AIC = train(PHDCMWBC ~ avg_veges + avg_fruit + avg_meat + avg_grains + AGEC + SEX,
    method = "lm", 
    data = final,
    trControl = params), 
  w2hratio_full = train(w2hratio ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX,
    method = "lm", 
    data = final,
    trControl = params),   
  w2hratio_AIC = train(w2hratio ~ avg_veges + avg_fruit + avg_meat + avg_grains + AGEC + SEX,
    method = "lm", 
    data = final,
    trControl = params),   
  w2hratio_BIC = train(w2hratio ~ avg_veges + avg_fruit + avg_grains + 
    AGEC + SEX,
    method = "lm", 
    data = final,
    trControl = params)   
)

cv_results_bmi = resamples(cv_objects[1:2], metric = "Rsquared")
ggplot(cv_results_bmi) +
  theme_bw() +
  labs(x = "Models", y = "Mean Absolute Error", title = "10-Fold CV Performance")

cv_results_waist = resamples(cv_objects[3:4], metric = "Rsquared")
ggplot(cv_results_waist) +
  theme_bw() +
  labs(x = "Models", y = "Mean Absolute Error", title = "10-Fold CV Performance")

cv_results_w2hratio = resamples(cv_objects[5:7], metric = "Rsquared")
ggplot(cv_results_w2hratio) +
  theme_bw() +
  labs(x = "Models", y = "Mean Absolute Error", title = "10-Fold CV Performance")

Regression assumptions

First model (full BMI)

bmi_full <- lm(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat +
  avg_grains + AGEC + SEX, 
  method = "lm", 
  data = final,
  trControl = params)
summary(bmi_full)
## 
## Call:
## lm(formula = BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + 
##     avg_grains + AGEC + SEX, data = final, method = "lm", trControl = params)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.835  -3.793  -0.854   2.789  35.088 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.975387   0.243339 106.746  < 2e-16 ***
## avg_veges   -0.124598   0.032734  -3.806 0.000142 ***
## avg_fruit   -0.198827   0.046527  -4.273 1.95e-05 ***
## avg_dairy    0.097887   0.066491   1.472 0.141009    
## avg_meat     0.216046   0.049438   4.370 1.26e-05 ***
## avg_grains  -0.174983   0.028362  -6.170 7.19e-10 ***
## AGEC         0.053893   0.003592  15.005  < 2e-16 ***
## SEX2        -0.663969   0.127662  -5.201 2.03e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.445 on 7803 degrees of freedom
## Multiple R-squared:  0.04127,    Adjusted R-squared:  0.04041 
## F-statistic: 47.99 on 7 and 7803 DF,  p-value: < 2.2e-16
model.diag.metrics <- augment(bmi_full)
head(model.diag.metrics)
## # A tibble: 6 × 15
##   .rownames BMISC avg_veges avg_fruit avg_dairy avg_meat avg_grains  AGEC SEX  
##   <chr>     <dbl>     <dbl>     <dbl>     <dbl>    <dbl>      <dbl> <int> <fct>
## 1 1          20.9     3.62      0.412      2.51     4.96       6.11    46 2    
## 2 2          29.8     2.95      2.03       1.44     2.19       3.45    77 1    
## 3 3          24.4     9.08      1.34       1.87     3.04       2.45    55 2    
## 4 4          21.4     5.87      5.06       3.64     2.79       2.35    59 2    
## 5 6          25.1     4.54      0.154      1.59     1.56       1.67    35 2    
## 6 8          30.7     0.706     0          1.77     1.59       3.56    35 1    
## # … with 6 more variables: .fitted <dbl>, .resid <dbl>, .hat <dbl>,
## #   .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
plot1 <- ggplot(model.diag.metrics, aes(avg_meat, BMISC)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(aes(xend = avg_meat, yend = .fitted), color = "red", size = 0.3)
plot2 <- ggplot(model.diag.metrics, aes(avg_dairy, BMISC)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(aes(xend = avg_dairy, yend = .fitted), color = "red", size = 0.3)
plot3 <- ggplot(model.diag.metrics, aes(avg_grains, BMISC)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(aes(xend = avg_grains, yend = .fitted), color = "red", size = 0.3)
plot4 <- ggplot(model.diag.metrics, aes(avg_veges, BMISC)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(aes(xend = avg_veges, yend = .fitted), color = "red", size = 0.3)


grid.arrange(plot1, plot2, plot3, plot4, ncol=2)

The residuals mostly seem to follow the same pattern across features

par(mfrow=c(2,2))
plot(bmi_full)

Residuals vs. fitted is relatively linear, indicating a linear relationship. Normal Q-Q plot, standardised residuals follows closely the normal line, indicating normality of residuals. Scale-location has no obvious non-linear pattern, indicating homoscedasticity. In the residuals vs. leverage graph a number of points are outside of Cook’s distance, indicating they are outliers. Some outlier effects may exist in the regression models specified above.

Comparing standardized betas

# creating scaled df
sex <- final$SEX
socioeconomic <- final$SF2SA1QN
scaled_df <- final %>% 
  dplyr::select(-c(SEX, SF2SA1QN)) %>% 
  scale() %>% as.data.frame() 


scaled_df$SEX <- sex
scaled_df$SF2SA1QN <- socioeconomic

# lm scaled to get standardized betas to compare
bmi_scaled <- lm(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = scaled_df)
w2h_scaled <- lm(w2hratio ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = scaled_df)
waist_scaled <- lm(PHDCMWBC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = scaled_df)


tab_model(bmi_scaled, w2h_scaled, waist_scaled)
  BMISC w 2 hratio PHDCMWBC
Predictors Estimates CI p Estimates CI p Estimates CI p
(Intercept) 0.06 0.03 – 0.09 <0.001 0.09 0.06 – 0.12 <0.001 0.36 0.33 – 0.39 <0.001
avg veges -0.05 -0.07 – -0.02 <0.001 -0.05 -0.07 – -0.03 <0.001 -0.04 -0.06 – -0.02 <0.001
avg fruit -0.05 -0.07 – -0.03 <0.001 -0.07 -0.09 – -0.05 <0.001 -0.05 -0.07 – -0.03 <0.001
avg dairy 0.02 -0.01 – 0.04 0.141 -0.01 -0.03 – 0.01 0.275 0.01 -0.01 – 0.03 0.255
avg meat 0.05 0.03 – 0.08 <0.001 0.03 0.01 – 0.05 0.007 0.03 0.01 – 0.06 0.002
avg grains -0.08 -0.10 – -0.05 <0.001 -0.05 -0.08 – -0.03 <0.001 -0.06 -0.09 – -0.04 <0.001
AGEC 0.17 0.15 – 0.19 <0.001 0.38 0.36 – 0.40 <0.001 0.28 0.26 – 0.30 <0.001
SEX [2] -0.12 -0.16 – -0.07 <0.001 -0.18 -0.22 – -0.14 <0.001 -0.69 -0.73 – -0.64 <0.001
Observations 7811 7811 7811
R2 / R2 adjusted 0.041 / 0.040 0.155 / 0.154 0.190 / 0.190

meat has the biggest effect on BMI overall effect and associated with increased BMI. fruit and veges associated with reduced waist circumference and waist to height ratio.